///\file cu_convolution.cu
///\brief contient les implémentations de la classe CUDA::Convolution
///\author { V.Allombert, A.Blanchard, A.Carteron, V.Pelletier } 
#include "cu_convolution.hh"
#include "sizor.hh"
#include <iostream>

namespace CUDA{
  //Texture reference over the current input chunk: 2-D, sampled as raw
  //unsigned bytes (one texel per colour channel). Bound/unbound by the
  //host around each kernel launch in Convolution::convol.
  texture <unsigned char, 2, cudaReadModeElementType> texRef;

  //Shared-memory buffers used by the convolution kernel.
  //m_convo caches the convolution matrix; 169 = 13*13, so matrices up to
  //13x13 fit -- TODO confirm larger matrices are rejected upstream.
  //tmp_out stages SHARED_LENGTH result pixels per thread before the
  //write-back to global memory (NB_THREADS and SHARED_LENGTH come from a
  //project header not visible here).
  __shared__ int m_convo[169];
  __shared__ pixel tmp_out[NB_THREADS][SHARED_LENGTH];
  
  //Fonction de convolution kernel
  //Convolution kernel: one thread per output column; each thread walks
  //every output row of the chunk in strips of SHARED_LENGTH rows.
  //
  //Expected launch: 1-D grid (<<<MPS, NB_THREADS>>>, see
  //Convolution::convol) with the input chunk bound to texRef beforehand.
  //Assumes dim*dim <= 169 (capacity of m_convo).
  //
  // out     : output chunk, row-major with row length w - 2*(dim/2)
  // h       : input chunk height in rows
  // w       : input chunk width in pixels
  // matrix  : dim*dim convolution coefficients (device global memory)
  // dim     : side length of the square convolution matrix
  // divider : divisor normalizing each accumulated channel
  __global__ void convolution(pixel* out, 
			      unsigned int h, 
			      unsigned int w, 
			      int* matrix,
			      unsigned int dim,
			      unsigned int divider
			      ){
  
    //Cooperatively stage the convolution matrix into shared memory
    for(unsigned int i = threadIdx.x; i < dim*dim; i += blockDim.x)
      m_convo[i] = matrix[i];
    
    __syncthreads();

    //j: current output row (starts past the top border of dim/2 rows)
    unsigned int j = dim/2;
    //stride: this thread's fixed output column (global 1-D thread index).
    //NOTE(review): nothing guards stride against the output row length
    //decout below -- presumably the launch configuration covers exactly
    //the chunk width; verify against the host-side sizing.
    unsigned int stride = (blockIdx.x*blockDim.x)+threadIdx.x;
    
    int r, g, b, a;
    unsigned int y;
    //Output rows are narrower than input rows by the left+right borders
    unsigned int decout = (w - 2*(dim/2));
    h -= dim/2;

    do{
      //Accumulate up to SHARED_LENGTH rows of results into shared memory
      for(unsigned int i = 0; i < SHARED_LENGTH && j+i < h; i++){
	r = 0;
	g = 0;
	b = 0;
	a = 0;
	
	//Top row of the dim x dim neighbourhood of this output pixel
	y = j+i - dim/2;

	//Weighted sum over the neighbourhood, one channel at a time.
	//The texture holds raw bytes, so the x coordinate is in bytes:
	//pixel column * sizeof(pixel) plus the channel offset (0..3)
	//-- assumes pixel is four one-byte channels in r,g,b,a order.
	for(unsigned int l = 0; l < dim; l++)  
	  for(unsigned int k = 0; k < dim; k++){
	    r += tex2D(texRef, (stride+k)*sizeof(pixel),   y+l)
	      *m_convo[l*dim+k];
	    g += tex2D(texRef, (stride+k)*sizeof(pixel)+1, y+l)
	      *m_convo[l*dim+k];
	    b += tex2D(texRef, (stride+k)*sizeof(pixel)+2, y+l)
	      *m_convo[l*dim+k];
	    a += tex2D(texRef, (stride+k)*sizeof(pixel)+3, y+l)
	      *m_convo[l*dim+k];
	  }
	
	//Stage the normalized result; (x > 0)* clamps negatives to zero
	tmp_out[threadIdx.x][i].r = (r > 0)*(r/divider);
	tmp_out[threadIdx.x][i].g = (g > 0)*(g/divider);
	tmp_out[threadIdx.x][i].b = (b > 0)*(b/divider);
	tmp_out[threadIdx.x][i].a = (a > 0)*(a/divider);
      }
      
      //Flush the staged strip to global memory, advancing j
      for(unsigned int i = 0; i < SHARED_LENGTH && j < h; i++, j++)
	out[(j-dim/2)*decout + stride] = tmp_out[threadIdx.x][i];

    } while(j < h); //More rows remain in this chunk
  }
  


  Convolution::Convolution(const Matrix<int> &m, unsigned int d): Generic::Convolution(m,d) {}

  ///\brief Convolves \a input into \a output on the GPU.
  ///
  ///The image is cut into chunks by CUDA::Sizor and processed in a
  ///double-buffered pipeline over two CUDA streams: pinned host staging
  ///buffer -> texture-backing array -> kernel -> asynchronous copy back
  ///into \a output.
  ///\param input  source image (host memory)
  ///\param output destination image (host memory), sized by the caller
  void Convolution::convol(const Image& input, Image& output) const{
    //Chunking
    CUDA::Sizor sizor(input,m);

    cudaError_t err;

    //Host page-locked (pinned) staging buffers, one per stream
    pixel* plmem[2];  
    err = cudaMallocHost(&plmem[0], (sizor.getMaxWidth(CUDA::IN)*sizor.getMaxHeight(CUDA::IN))*sizeof(pixel));
    if(err)
      std::cerr<<"PL1 :"<<cudaGetErrorString(err)<<std::endl;

    err = cudaMallocHost(&plmem[1], (sizor.getMaxWidth(CUDA::IN)*sizor.getMaxHeight(CUDA::IN))*sizeof(pixel));
    if(err)
      std::cerr<<"PL2 :"<<cudaGetErrorString(err)<<std::endl;

    //GPU arrays backing the texture, one per stream.
    //The texture is sampled byte-wise, hence the width is in bytes.
    cudaArray* gpmemIN[2];
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
    err = cudaMallocArray(&gpmemIN[0], &channelDesc,
		    sizor.getMaxWidth(CUDA::IN)*sizeof(pixel),
		    sizor.getMaxHeight(CUDA::IN));
    if(err)
      std::cerr<<"GPI1 :"<<cudaGetErrorString(err)<<"\n"<<
	sizor.getMaxWidth(CUDA::IN)*sizeof(pixel)<<"\n"<<sizor.getMaxHeight(CUDA::IN)<<std::endl;

    err = cudaMallocArray(&gpmemIN[1], &channelDesc,
		    sizor.getMaxWidth(CUDA::IN)*sizeof(pixel),
		    sizor.getMaxHeight(CUDA::IN));
    if(err)
      std::cerr<<"GPI2 :"<<cudaGetErrorString(err)<<std::endl;

    //Convolution matrix, uploaded once to device global memory
    int* gpumatrix;
    err = cudaMalloc(&gpumatrix, m.width()*m.height()*sizeof(int));
    if(err)
      std::cerr<<"MAT :"<<cudaGetErrorString(err)<<std::endl;

    err = cudaMemcpy(gpumatrix, &m[0][0], 
	       m.width()*m.height()*sizeof(int),
	       cudaMemcpyHostToDevice);
    if(err)
      std::cerr<<"MATCPY :"<<cudaGetErrorString(err)<<std::endl;

    //GPU output buffers, one per stream
    pixel* gpmemOUT[2];
    err = cudaMalloc(&gpmemOUT[0], (sizor.getMaxWidth(CUDA::OUT)*sizor.getMaxHeight(CUDA::OUT))*sizeof(pixel));
    if(err)
      std::cerr<<"GPO1 :"<<cudaGetErrorString(err)<<std::endl;
    err = cudaMalloc(&gpmemOUT[1], (sizor.getMaxWidth(CUDA::OUT)*sizor.getMaxHeight(CUDA::OUT))*sizeof(pixel));
    if(err)
      std::cerr<<"GPO2 :"<<cudaGetErrorString(err)<<std::endl;

    // ==== Per-stream processing =================================== //
    cudaStream_t stream[2];

    //Stream initialization
    for(unsigned int i = 0; i < 2; i++)
      cudaStreamCreate(&stream[i]);
 
    int chunk = 0;
    int block_size = (sizor.getMaxWidth(CUDA::IN)*sizor.getMaxHeight(CUDA::IN));

    //While there are chunks left to process
    while(sizor.has_next()){
      //Gather the chunk's rows into the pinned staging buffer
      for(unsigned int i = 0; i < sizor.getCurrentHeight(CUDA::IN); i++){
	cudaMemcpyAsync(&plmem[chunk%2][i*sizor.getMaxWidth(CUDA::IN)], 
			&input[sizor.current().byi+i][sizor.current().bxi],
			sizor.getCurrentWidth(CUDA::IN)*sizeof(pixel),
			cudaMemcpyHostToHost, stream[chunk%2]);
      }
      
      //Upload the staged chunk to the texture-backing array
      cudaMemcpyToArrayAsync(gpmemIN[chunk%2], 0, 0, plmem[chunk%2], 
			     block_size*sizeof(pixel), cudaMemcpyHostToDevice,
			     stream[chunk%2]);
    
      //Bind.
      //NOTE(review): texRef is a single texture reference shared by both
      //streams; rebinding here while the other stream's kernel may still
      //be in flight is a potential race -- confirm the intended overlap.
      cudaBindTextureToArray(texRef, gpmemIN[chunk%2], channelDesc);
      
      //KERNEL
      convolution<<<MPS,NB_THREADS,0,stream[chunk%2]>>>(gpmemOUT[chunk%2],
						sizor.getCurrentHeight(CUDA::IN),
						sizor.getMaxWidth(CUDA::IN), 
						gpumatrix,
						m.width(),
						divider);
      //Launch-configuration errors only surface through cudaGetLastError
      err = cudaGetLastError();
      if(err)
	std::cerr<<"KERNEL :"<<cudaGetErrorString(err)<<std::endl;

      //Unbind
      cudaUnbindTexture(texRef);
      
      //Copy the finished chunk back into the output image
      for(unsigned int i=0; i<sizor.getCurrentHeight(CUDA::OUT);++i){
	cudaMemcpyAsync(&output[sizor.current().byo+i][sizor.current().bxo],
			&gpmemOUT[chunk%2][i*sizor.getMaxWidth(CUDA::OUT)],
			sizor.getCurrentWidth(CUDA::OUT)*sizeof(pixel),
			cudaMemcpyDeviceToHost,stream[chunk%2]);
      }
      
      chunk++;
      sizor.next();
    }

    //Drain both streams before any teardown: cudaStreamDestroy does NOT
    //block, and `output` plus the buffers below could otherwise still be
    //in use by queued asynchronous work when we free them / return.
    for(unsigned int i = 0; i < 2; i++){
      err = cudaStreamSynchronize(stream[i]);
      if(err)
	std::cerr<<"SYNC"<<i<<" :"<<cudaGetErrorString(err)<<std::endl;
      cudaStreamDestroy(stream[i]);
    }

    //Memory releases.
    //BUGFIX: cudaFreeHost takes the pointer itself, not its address;
    //passing &plmem[i] freed nothing and returned cudaErrorInvalidValue.
    cudaFreeHost(plmem[0]);
    cudaFreeHost(plmem[1]);

    cudaFreeArray(gpmemIN[0]);
    cudaFreeArray(gpmemIN[1]);

    cudaFree(gpmemOUT[0]);
    cudaFree(gpmemOUT[1]);

    //BUGFIX: the device-side convolution matrix was previously leaked
    cudaFree(gpumatrix);
  }

  
}
